pegRNA Library Comparison

Author

Lance Parsons

Read data

  1. Get list of samples
Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
  1. Read Samtools idxstats to get human coverage for normalization

Notes:

  • The counts include the total number of reads aligned, they are not limited to uniquely aligned reads.
  • The counts are reads, not pairs or fragments
Code
idxstats_exogenousrna_dir <-
  "results/samtools_idxstats/exogenous_rna/"

idxstats_human_dir <-
  "results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"

bowtie2_human_logs <-
  "results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"

idxstats <- tibble()

for (row in seq_len(nrow(sample_units))) {
  sample <- sample_units[row, ]$sample_unit

  # Read `idsxstats` for exogenous mapped reads
  exogenous_rna_stats <- read_tsv(
    file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
    dplyr::filter(!sequence_name %in% c("*")) %>%
    dplyr::select(sequence_name, mapped_reads) %>%
    dplyr::mutate(sample = sample)

  # Read `idxstats` for human mapped reads
  human_stats <- read_tsv(
    file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  grch38_mapped_reads <- human_stats %>%
    dplyr::filter(!sequence_name %in% c("*")) %>%
    dplyr::select(mapped_reads) %>%
    sum()
  grch38_mapped_reads <- tibble(
    sequence_name = "grch38_mapped_reads",
    mapped_reads = grch38_mapped_reads,
    sample = sample
  )

  # Read bowtie2 logs for unmapped reads
  bowtie2_log <- readLines(
    file.path(bowtie2_human_logs, sprintf("%s.log", sample))
  )
  total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
  total_reads <- total_pairs * 2
  unmapped_reads <- tibble(
    sequence_name = "unmapped",
    mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
    sample = sample
  )

  # Consolidate counts for rows
  idxstats <- rbind(
    idxstats,
    exogenous_rna_mapped_reads,
    grch38_mapped_reads,
    unmapped_reads
  )
}
idxstats

Sample Correlation

BAM Correlation

Divide the human genome into bins, determine the counts in each of those bins and calculate the correlation between samples.

Note: May need to remove zero count bins

Correlation of gene counts

Using featureCounts, we generated fragment counts for “exon” features in the Ensemble hg38 annotation and summed them per “gene_id”. Primary alignments were counted, even if the fragements were mulitmappers.

Code
feature_counts <- readr::read_tsv(
  paste0(
    "results/alignments/Homo_sapiens.GRCh38.dna.primary_assembly/",
    "featureCounts/all_counts.featureCounts"
  ),
  comment = "#",
  col_types = list(
    Geneid = col_character(),
    Chr = col_character(),
    Start = col_character(),
    End = col_character(),
    Strand = col_character(),
    .default = col_integer()
  )
) %>%
  rename_all(~ stringr::str_replace_all(
    ., "results/alignments/Homo_sapiens.GRCh38.dna.primary_assembly/sorted/", ""
  )) %>%
  rename_all(~ stringr::str_replace_all(., ".bam", ""))
feature_counts

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = feature_counts %>%
    tibble::column_to_rownames("Geneid") %>%
    dplyr::select(!Chr:Length),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
dds
class: DESeqDataSet 
dim: 62703 56 
metadata(1): version
assays(1): counts
rownames(62703): ENSG00000160072 ENSG00000279928 ... ENSG00000277475
  ENSG00000275405
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(8): sample_unit sample_name ... fq1 fq2

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  paste0("day", vsd$day),
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differetial Expression

Default DESeq2 size factors

These analyses use size factors calculated by DESeq2. They are based off the gene counts and attempt to account for outliers.

Calculate Differential Expression

Calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds <- DESeq(dds, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates, fitting model and testing: 4 workers
Code
res <- results(dds, alpha = 0.05)
res
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 62703 rows and 6 columns
                   baseMean log2FoldChange     lfcSE        stat       pvalue
                  <numeric>      <numeric> <numeric>   <numeric>    <numeric>
ENSG00000160072  11.9966463     -0.4677504  0.164874  -2.8370214   0.00455366
ENSG00000279928   0.1915711     -0.1072723  2.316584  -0.0463063   0.96306616
ENSG00000228037   0.0798637     -0.5309350  2.936431  -0.1808096   0.85651700
ENSG00000142611   3.1298205      0.0187025  0.387332   0.0482854   0.96148876
ENSG00000284616   0.1527401     -0.3778479  2.906701  -0.1299920   0.89657277
...                     ...            ...       ...         ...          ...
ENSG00000271254    4.307054       0.299222 0.2624723   1.1400143  2.54280e-01
ENSG00000275987    0.331977      -0.693435 1.6796699  -0.4128403  6.79724e-01
ENSG00000268674    0.215662      -0.119064 2.2253191  -0.0535042  9.57330e-01
ENSG00000277475  585.344191       1.047180 0.0472535  22.1608965 8.19039e-109
ENSG00000275405 6972.815646      -0.631537 0.0486461 -12.9822882  1.54204e-38
                        padj
                   <numeric>
ENSG00000160072    0.0325866
ENSG00000279928           NA
ENSG00000228037           NA
ENSG00000142611    0.9877092
ENSG00000284616           NA
...                      ...
ENSG00000271254  5.49745e-01
ENSG00000275987           NA
ENSG00000268674           NA
ENSG00000277475 3.58450e-106
ENSG00000275405  1.65473e-36
Code
summary(res)

out of 50184 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1550, 3.1%
LFC < 0 (down)     : 1892, 3.8%
outliers [1]       : 19, 0.038%
low counts [2]     : 27845, 55%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

MA Plot

Code
plotMA(res, ylim = c(-5, 5))

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plotCounts(dds, gene = which.min(res$padj), intgroup = "cell_line")

Run log fold change shrinkage to account for low counts.

Code
res_lfc <- lfcShrink(dds,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot of shrunken log2 fold changes

Code
plotMA(res_lfc, ylim = c(-5, 5))

Reads mapped to hg38 size factors

These analyses use size factors determined by the number of reads mapped to hg38. The size factor for a given sample is:

reads_mapped_to_hg38_sample / median(reads_mapped_to_hg38_all_samples)

This is similar to how we normalized the coverage plots (though admittedly, subtley different so that the factors scaled things similar to how DESeq2 usually works)

Code
hg38_size_factors <- idxstats %>%
  dplyr::filter(sequence_name == "grch38_mapped_reads") %>%
  dplyr::select(sample, mapped_reads) %>%
  deframe()
dds_custom_sf <- dds
sizeFactors(dds_custom_sf) <- (hg38_size_factors / median(hg38_size_factors))
dds_custom_sf
class: DESeqDataSet 
dim: 62703 56 
metadata(1): version
assays(4): counts mu H cooks
rownames(62703): ENSG00000160072 ENSG00000279928 ... ENSG00000277475
  ENSG00000275405
rowData names(38): baseMean baseVar ... deviance maxCooks
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(9): sample_unit sample_name ... fq2 sizeFactor

Calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_custom_sf <- DESeq(dds_custom_sf, parallel = TRUE)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
res_custom_sf <- results(dds_custom_sf, alpha = 0.05)
res_custom_sf
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 62703 rows and 6 columns
                   baseMean log2FoldChange     lfcSE         stat       pvalue
                  <numeric>      <numeric> <numeric>    <numeric>    <numeric>
ENSG00000160072  12.6829409     -0.3747049  0.177746  -2.10809438    0.0350228
ENSG00000279928   0.2130504     -0.0147461  2.182598  -0.00675621    0.9946094
ENSG00000228037   0.0724539     -0.4354615  2.936431  -0.14829617    0.8821090
ENSG00000142611   3.2863534      0.0905906  0.414356   0.21862975    0.8269385
ENSG00000284616   0.1747614     -0.2904120  2.924486  -0.09930361    0.9208972
...                     ...            ...       ...          ...          ...
ENSG00000271254    4.540355      0.3961267 0.2651908   1.49374209  1.35243e-01
ENSG00000275987    0.343530     -0.5780053 1.6615547  -0.34787015  7.27938e-01
ENSG00000268674    0.220682     -0.0173511 2.1638339  -0.00801867  9.93602e-01
ENSG00000277475  648.696012      1.1404347 0.0537192  21.22955922 5.09380e-100
ENSG00000275405 7385.382560     -0.5362404 0.0512343 -10.46643555  1.23195e-25
                       padj
                  <numeric>
ENSG00000160072    0.167593
ENSG00000279928          NA
ENSG00000228037          NA
ENSG00000142611    0.948023
ENSG00000284616          NA
...                     ...
ENSG00000271254 4.08052e-01
ENSG00000275987          NA
ENSG00000268674          NA
ENSG00000277475 2.37450e-97
ENSG00000275405 1.02974e-23
Code
summary(res_custom_sf)

out of 50184 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 2026, 4%
LFC < 0 (down)     : 1382, 2.8%
outliers [1]       : 19, 0.038%
low counts [2]     : 25925, 52%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Code
plotMA(res_custom_sf, ylim = c(-5, 5))

Code
plotCounts(dds_custom_sf,
  gene = which.min(res_custom_sf$padj),
  intgroup = "cell_line"
)

Shrink log fold changes

Code
res_lfc_custom_sf <- lfcShrink(dds_custom_sf,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
plotMA(res_lfc_custom_sf, ylim = c(-5, 5))

Gene Biotype

For each gene_id in the featureCounts table, we look up the gene_biotype from Ensembl.

Code
hg38 <- EnsDb.Hsapiens.v86
hg38_genes <- genes(hg38, return.type = "data.frame")

counts_by_biotype <- feature_counts %>%
  full_join(hg38_genes %>% dplyr::select(gene_id, gene_biotype),
    by = c("Geneid" = "gene_id")
  ) %>%
  group_by(gene_biotype) %>%
  dplyr::select("gene_biotype", !c(Geneid, Chr, Start, End, Strand, Length)) %>%
  summarise_all(list(count = sum)) %>%
  tidyr::pivot_longer(!gene_biotype, names_to = "sample", values_to = "count")
counts_by_biotype
Code
p <- ggplot(
  data = subset(counts_by_biotype, !is.na(count)),
  aes(x = sample, y = count, fill = gene_biotype)
) +
  geom_bar(stat = "identity", position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
p